
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle
from tqdm import tqdm
import os
# Open the Amazon Fine Food Reviews SQLite database (local absolute path —
# adjust when running on another machine).
con = sqlite3.connect(r"D:\AppliedAI\AAIC_Course_handouts\11_Amazon Fine Food Reviews\amazon-fine-food-reviews\database.sqlite")
# Load every review except the neutral 3-star ones; the remaining scores are
# later mapped to a binary positive/negative label.
data = pd.read_sql_query(""" SELECT * FROM Reviews WHERE Score != 3""",con)
# Change Score with 1 n 2 as -ve and 4 n 5 as +ve
# BUG FIX: the function body had lost its indentation in the notebook export
# (a SyntaxError as plain Python); restored here.
def chng_to_0_or_1(Score):
    """Map a 1-5 star rating to a binary sentiment label.

    Returns 1 for positive (4 or 5), 0 for negative (1 or 2), and None for
    any out-of-range value so corrupt rows (e.g. a stray 6 or 7) can be
    spotted and dropped. Score == 3 rows were already excluded by the SQL
    query that loaded the data.
    """
    if Score == 4 or Score == 5:
        return 1
    elif Score == 1 or Score == 2:
        return 0
    # Defensive default: unknown rating -> None (missing label).
    return None
# Replace the raw 1-5 star column with the binary label computed above.
currentScore = data["Score"]
new_Score = currentScore.map(chng_to_0_or_1)
data["Score"] = new_Score
print ("Number of data points available")
print (data.shape)#Gives original number of data points available
#2 Data Cleaning a.) Getting rid of duplicates and b.) if helpnessdenominator < helpfulnessnumerator
# Two rows agreeing on all of these columns are the same review posted twice
# (the same product listed under different ProductIds); keep the first copy.
data = data.drop_duplicates(subset = ["UserId","ProfileName","HelpfulnessNumerator","HelpfulnessDenominator","Score","Time","Summary","Text"], keep='first', inplace=False)
print ("Number of data points after removing duplicates")
print (data.shape)#Gives data points are deduplication
# Reference: Copied from above cell final=final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]
# "x of y people found this helpful" can never have x > y, so such rows are
# data-entry errors and are dropped.
data=data[data.HelpfulnessNumerator<=data.HelpfulnessDenominator]
print ("Number of data points after removing where HelpfulnessNumerator is more than HelpfulnessDenominator ")
print (data.shape)
#3 Preprocessing begins
#Convert to lower case,convert shortcut words to proper words, remove Special Character
#i) Convert to lower case:
data["Text"] = (data["Text"].str.lower())
data["Summary"] = (data["Summary"].str.lower())
#ii) Convert Shortcuts words to proper words
#List of Words are:https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
#Reference:https://stackoverflow.com/questions/39602824/pandas-replace-string-with-another-string
# BUG FIX: Series.replace with a plain dict only replaces cells whose ENTIRE
# value equals a key, so on full review texts the original code was a no-op.
# The dict is now defined once (it was duplicated verbatim for Text and
# Summary), each key is turned into a word-boundary regex, and regex=True is
# passed so contractions are expanded INSIDE the text. Keys are lowercased
# because the text was lowercased above (the original "I'd" etc. could never
# match), and longer contractions are replaced first so e.g. "couldn't've"
# is handled before "couldn't".
contractions = {
    "ain't": "am not", "amn't": "am not", "aren't": "are not",
    "can't": "cannot", "cause": "because", "could've": "could have",
    "couldn't": "could not", "couldn't've": "could not have",
    "daren't": "dare not", "daresn't": "dare not", "dasn't": "dare not",
    "didn't": "did not", "doesn't": "does not", "don't": "do not",
    "e'er": "ever", "everyone's": "everyone is", "finna": "fixing to",
    "gimme": "give me", "gonna": "going to", "gon't": "go not",
    "gotta": "got to", "hadn't": "had not", "hasn't": "has not",
    "haven't": "have not", "he'd": "he had", "he'll": "he shall",
    "he's": "he has", "he've": "he have", "how'd": "how did",
    "how'll": "how will", "how're": "how are", "how's": "how has",
    "I'd": "I had", "I'll": "I shall", "I'm": "I am",
    "I'm'a": "I am about to", "I'm'o": "I am going to", "I've": "I have",
    "isn't": "is not", "it'd": "it would", "it'll": "it shall",
    "it's": "it has", "let's": "let us", "mayn't": "may not",
    "may've": "may have", "mightn't": "might not", "might've": "might have",
    "mustn't": "must not", "mustn't've": "must not have",
    "must've": "must have", "needn't": "need not", "ne'er": "never",
    "o'clock": "of the clock", "o'er": "",  # NOTE(review): original maps o'er to empty — confirm intent
    "ol'": "old", "oughtn't": "ought not", "shalln't": "shall not",
    "shan't": "shall not", "she'd": "she had", "she'll": "she shall",
    "she's": "she is", "should've": "should have",
    "shouldn't": "should not", "shouldn't've": "should not have",
    "somebody's": "somebody has", "someone's": "someone has",
    "something's": "something has", "that'll": "that will",
    "that're": "that are", "that's": "that is", "that'd": "that would",
    "there'd": "there had", "there'll": "there shall",
    "there're": "there are", "there's": "there is",
    "these're": "these are",  # BUG FIX: original had the typo "hese are"
    "they'd": "they had", "they'll": "they will", "they're": "they are",
    "they've": "they have", "this's": "",  # NOTE(review): original maps this's to empty — confirm intent
    "those're": "those are", "tis": "it is", "twas": "it was",
    "wasn't": "was not", "we'd": "we had", "we'd've": "we would have",
    "we'll": "we will", "we're": "we are", "we've": "we have",
    "weren't": "were not", "what'd": "what did", "what'll": "what will",
    "what're": "what are", "what's": "what is", "what've": "what have",
    "when's": "when is", "where'd": "where did", "where're": "where are",
    "where've": "where have", "which's": "which has", "who'd": "who would",
    "who'd've": "who would have", "who'll": "who shall",
    "who're": "who are", "who's": "who has", "who've": "who have",
    "why'd": "why did", "why're": "why are", "why's": "why has",
    "won't": "will not", "would've": "would have", "wouldn't": "would not",
    "y'all": "you all", "you'd": "you had", "you'll": "you shall",
    "you're": "you are", "you've": "you have",
}
# Lookarounds instead of \b so keys ending in an apostrophe (ol') still match;
# longest keys first so sub-contractions do not fire early.
_contraction_patterns = {
    r"(?<!\w)" + re.escape(key.lower()) + r"(?!\w)": value
    for key, value in sorted(contractions.items(), key=lambda kv: -len(kv[0]))
}
data['Text'] = data['Text'].replace(_contraction_patterns, regex=True)
##############Lets do the same for summary Text##################################
data['Summary'] = data['Summary'].replace(_contraction_patterns, regex=True)
########################################################################################
# iii) Remove Special Characters except alpahbets and numbers
#The reason i dont want to remove number people might write got five eggs as 5 eggs or vice versa and dont want to lose
#that information which could be useful
#Ref:https://stackoverflow.com/questions/33257344/how-to-remove-special-characers-from-a-column-of-dataframe-using-module-re
data["Text"]=data["Text"].map(lambda x: re.sub(r'[^a-zA-Z_0-9 -]', '', x))
data["Summary_copy"]=data["Summary"].map(lambda x: re.sub(r'[^a-zA-Z_0-9 -]', '', x))
#The Summary are usually so small if we remove few stopwords the meaning itself would be complely lost or chamge
# So let us see what all stopwords we have
#Ref:::::::::https://stackoverflow.com/questions/5511708/adding-words-to-nltk-stoplist
#https://chrisalbon.com/machine_learning/preprocessing_text/remove_stop_words/
# NOTE(review): this assignment shadows the `stopwords` module imported from
# nltk.corpus at the top of the file; from here on the name is a plain list.
stopwords = nltk.corpus.stopwords.words('english')
# Extra tokens to drop: modal verbs plus HTML line-break remnants.
newStopWords = ['would','could','br','<br>','<','>']
# Negations are kept because they flip review sentiment.
notstopwords = ['not','no','nor']
stopwords.extend(newStopWords)
stopwords = [word for word in stopwords if word not in notstopwords]
# iv) For now let us just go with flow will use default stopwords as creating our own stop words is very time consuming
#Rather will use n-gram stratergy to get rid of problem of stopwords removal changing the meaning of sentences
#Ref:https://stackoverflow.com/questions/43184364/python-remove-stop-words-from-pandas-dataframe-give-wrong-output
# Tokenise on whitespace and drop stopwords; each cell becomes a word list...
data["New_Text"]= data['Text'].apply(lambda x: [item for item in str.split(x) if item not in stopwords])
data["Summary"]= data['Summary_copy'].apply(lambda x: [item for item in str.split(x) if item not in stopwords])
#Ref:https://stackoverflow.com/questions/37347725/converting-a-panda-df-list-into-a-string/37347837
#we are creating new column New_summary so in case in future we need summary it is intact
# ...and is then joined back into one space-separated string.
data["New_Text"]=data["New_Text"].apply(' '.join)
data["Summary"]=data["Summary"].apply(' '.join)
# v) Now lets do Stemming
#https://stackoverflow.com/questions/48617589/beginner-stemming-in-pandas-produces-letters-not-stems
# BUG FIX: SnowballStemmer.stem() operates on a SINGLE word. The original
# code called it on a whole review string, which does not stem the
# individual words. Each review is now split into tokens, every token is
# stemmed, and the tokens are re-joined with single spaces.
english_stemmer=SnowballStemmer('english', ignore_stopwords=True)

def _stem_sentence(sentence):
    """Stem each whitespace-separated token of *sentence* and re-join."""
    return ' '.join(english_stemmer.stem(word) for word in str(sentence).split())

# astype(str) guards against NaN floats before the string operations
# (see https://stackoverflow.com/questions/34724246).
data["New_Text"] = data["New_Text"].astype(str).apply(_stem_sentence)
data["Summary"] = data["Summary"].astype(str).str.lower().apply(_stem_sentence)
#vi) stemming without removing stop words (keeps the full review text)
data["Text_with_stop"] = data["Text"].astype(str).str.lower().apply(_stem_sentence)
# (The original re-stemmed "Summary" a second time and applied a no-op
# ''.join on the already-joined strings; both redundant passes were dropped.)
data["Text_with_stop"] = data["Text_with_stop"].astype(str)
data["Summary"] = data["Summary"].astype(str)
print(data["Score"].value_counts())
print ("Thus we see there are 85% and 15% positive and negative reviews,thus a unbalanced dataset.So to create a balanced \
dataset we first copy negative dataset 6 times than we sample with same number of times as positive")
# NOTE(review): the concat below only reorders rows (all positives, then all
# negatives); it does NOT oversample the minority class as the print above
# suggests — confirm whether balancing was intended here.
data_neg = data[data["Score"] == 0]
data_pos = data[data["Score"] == 1]
data = pd.concat([data_pos,data_neg])
#https://stackoverflow.com/questions/46429033/how-do-i-count-the-total-number-of-words-in-a-pandas-dataframe-cell-and-add-thos
# Word counts (spaces + 1) as extra features.
data["Text_length"]= (data["New_Text"].str.count(' ') + 1)
data["Summary_length"]= (data["Summary"].str.count(' ') + 1)
# BUG FIX: 'Time' holds unix epoch timestamps in seconds; without unit='s'
# pandas interprets the integers as nanoseconds and every date lands in 1970
# (sort order happens to survive, but the dates are meaningless).
data["Time_formatted"]= pd.to_datetime(data["Time"], unit='s')
data.sort_values(by=['Time_formatted'], inplace=True)
# Keep the 40k most recent reviews; .copy() avoids SettingWithCopyWarning
# from the in-place sort of a slice.
newdata = data.tail(40000).copy()
newdata.sort_values(by=['Time_formatted'], inplace=True)
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
from sklearn.model_selection import train_test_split
Y_40k = newdata['Score'].values
X_no_stop_40k = newdata['New_Text'].values
X_summary_40k = newdata ['Summary'].values
# shuffle=False gives a chronological split: train on the older 67% of
# reviews, test on the newer 33% (data was sorted by time above).
X_no_stop_train_40k, X_no_stop_test_40k, y_train_40k, y_test_40k = train_test_split(X_no_stop_40k, Y_40k, test_size=0.33, shuffle=False)
%time
from sklearn.feature_extraction.text import CountVectorizer
import math
# Bag-of-words over uni- and bi-grams; min_df=7 drops rare n-grams and
# max_features caps the vocabulary at the 9000 most frequent ones.
bow_vect = CountVectorizer(ngram_range = (1,2),min_df = 7,max_features=9000)
# Fit on train only; transform test with the same vocabulary (no leakage).
bow_X_train_no_stop_40k = bow_vect.fit_transform(X_no_stop_train_40k)
bow_X_test_no_stop_40k = bow_vect.transform(X_no_stop_test_40k)
##################################################
###############################################
# Decision-tree grid search on the bag-of-words features.
# BUG FIX: sklearn.grid_search was removed in scikit-learn 0.20;
# GridSearchCV/cross_val_score now live in sklearn.model_selection.
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
param_grid = {'max_depth':[1, 5, 10, 50, 100, 500, 1000],'min_samples_split':[5, 10, 100, 500,1000]}
# class_weight='balanced' compensates for the ~85/15 class skew; selection
# metric is ROC-AUC over 10-fold CV.
grid_DT_BOW = GridSearchCV(DecisionTreeClassifier(class_weight='balanced'),param_grid,scoring='roc_auc',cv=10, verbose=2)
grid_DT_BOW.fit(bow_X_train_no_stop_40k,y_train_40k)
grid_DT_BOW.best_params_
# Refit a tree with the chosen hyper-parameters for probability outputs and
# feature importances.
besthyperpara_bow_DT = DecisionTreeClassifier(max_depth=50, min_samples_leaf=10, min_samples_split=1000,class_weight='balanced')
besthyperpara_bow_DT.fit(bow_X_train_no_stop_40k,y_train_40k)
pred_proba_train_bow_DT=(besthyperpara_bow_DT.predict_proba(bow_X_train_no_stop_40k)[:,1])
pred_proba_test_bow_DT=(besthyperpara_bow_DT.predict_proba(bow_X_test_no_stop_40k)[:,1])
roc_auc_test_bow_DT_= (roc_auc_score(y_test_40k,pred_proba_test_bow_DT))
roc_auc_train_bow_DT = (roc_auc_score(y_train_40k,pred_proba_train_bow_DT))
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
%matplotlib inline
fpr_test_bow_DT, tpr_test_bow_DT, thresholds = roc_curve(y_test_40k, pred_proba_test_bow_DT)
fpr_train_bow_DT, tpr_train_bow_DT, thresholds = roc_curve(y_train_40k, pred_proba_train_bow_DT)
# create plot
default_dpi = plt.rcParamsDefault['figure.dpi']
plt.rcParams['figure.dpi'] = default_dpi*1.1
plt.plot(fpr_test_bow_DT, tpr_test_bow_DT, label=' Test ROC curve on Review Text')
plt.scatter(fpr_test_bow_DT, tpr_test_bow_DT, label=' Test ROC curve on Review Text')
plt.plot(fpr_train_bow_DT, tpr_train_bow_DT, label=' Train ROC curve on Review Text')
plt.scatter(fpr_train_bow_DT, tpr_train_bow_DT, label=' Train ROC curveon Review Text')
plt.plot([0, 1], [0, 1], 'k--', label='Random guess')
plt.minorticks_on()
plt.grid(b=True, which='both', color='0.65', linestyle='-')
_ = plt.xlabel('False Positive Rate')
_ = plt.ylabel('True Positive Rate')
_ = plt.title('ROC Curve')
_ = plt.xlim([-0.02, 1])
_ = plt.ylim([0, 1.02])
_ = plt.legend(loc="lower right")
from sklearn.metrics import roc_auc_score
# Hard-label predictions from the grid-search best estimator.
predict_DT_BOW_train = (grid_DT_BOW.predict(bow_X_train_no_stop_40k))
predict_DT_BOW_test = grid_DT_BOW.predict(bow_X_test_no_stop_40k)
# BUG FIX: the original assigned the TEST AUC to the *_train variable and
# vice versa; the summary table at the end then reported them swapped.
roc_auc_DT_BOW_train = roc_auc_score(y_train_40k, predict_DT_BOW_train)
roc_auc_DT_BOW_test = roc_auc_score(y_test_40k, predict_DT_BOW_test)
from sklearn.metrics import classification_report
print ("#######################################################")
print ("The classification report on Test dataset on Review Text")
print ("#######################################################")
print(classification_report(y_test_40k, predict_DT_BOW_test))
print ("#######################################################")
print ("The classification report on Training dataset Review Text")
print ("#######################################################")
print(classification_report(y_train_40k, predict_DT_BOW_train))
from sklearn.metrics import confusion_matrix
import scikitplot.metrics as skplt
# Shrink the figure for the 2x2 confusion-matrix plots.
default_dpi = plt.rcParamsDefault['figure.dpi']
plt.rcParams['figure.dpi'] = default_dpi*.63
skplt.plot_confusion_matrix(y_test_40k, predict_DT_BOW_test,normalize=True)
print ("The first matrix is that of Test in normalized format")
print ("The second matrix is that of Train in normalized format")
print ("The third matrix is that of Test in non normalized format")
print ("The fourth matrix is that of Train in non normalized format")
skplt.plot_confusion_matrix(y_train_40k, predict_DT_BOW_train,normalize=True)
skplt.plot_confusion_matrix(y_test_40k, predict_DT_BOW_test)
skplt.plot_confusion_matrix(y_train_40k, predict_DT_BOW_train)
# https://github.com/Manish-12/Decision-Tree-on-Amazon-fine-food-reviews/blob/master/Decision_tree.ipynb
# Feature importances sorted descending: the 20 most important n-grams,
# then all remaining features.
top_20_feature_bow = besthyperpara_bow_DT.feature_importances_.argsort()[::-1][:20]
below_20_feature_bow = besthyperpara_bow_DT.feature_importances_.argsort()[::-1][20:]
print(np.take(bow_vect.get_feature_names(),top_20_feature_bow))
print(np.take(bow_vect.get_feature_names(),below_20_feature_bow))
# BUG FIX: the next line was bare prose (a stray notebook markdown cell) and
# is a SyntaxError in Python; kept as a comment instead.
# Lets reduce the depth till 5 for Graphviz
besthyperpara_bow_DT_depth5 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=10, min_samples_split=1000,class_weight='balanced')
besthyperpara_bow_DT_depth5.fit(bow_X_train_no_stop_40k,y_train_40k)
#https://pythonprogramminglanguage.com/decision-tree-visual-example/
from IPython.display import Image
from sklearn.tree import export_graphviz
import graphviz
# Writes the depth-5 tree to tree_nonlimited.dot; the .png displayed below
# must be produced externally (e.g. dot -Tpng tree_nonlimited.dot).
dot_data = tree.export_graphviz(besthyperpara_bow_DT_depth5,feature_names =bow_vect.get_feature_names() ,out_file='tree_nonlimited.dot')
from IPython.display import Image
Image(filename = r'C:\Users\Prateek Saurabh\AppliedAI\Homework and assignments\tree_nonlimited.png')
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over uni- and bi-grams; min_df=7 drops rare n-grams, vocabulary
# capped at 5000 features. Fit on train only, transform test (no leakage).
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2),min_df = 7,max_features=5000)
tfidf_X_train = tf_idf_vect.fit_transform(X_no_stop_train_40k)
tfidf_X_test = tf_idf_vect.transform(X_no_stop_test_40k)
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
# BUG FIX: sklearn.grid_search was removed in scikit-learn 0.20;
# GridSearchCV now lives in sklearn.model_selection.
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth':[1, 5, 10, 50, 100, 500, 1000],'min_samples_split':[5, 10, 100, 500,1000]}
# Same search as for BOW but with 3-fold CV (TF-IDF fit is slower).
grid_DT_tfidf = GridSearchCV(DecisionTreeClassifier(class_weight='balanced'),param_grid,scoring='roc_auc',cv=3, verbose=2)
grid_DT_tfidf.fit(tfidf_X_train,y_train_40k)
grid_DT_tfidf.best_params_
# Refit with the chosen hyper-parameters for probabilities/importances.
besthyperpara_tfidf_DT = DecisionTreeClassifier(max_depth=50, min_samples_leaf=10, min_samples_split=1000,class_weight='balanced')
besthyperpara_tfidf_DT.fit(tfidf_X_train,y_train_40k)
pred_proba_train_tfidf_DT=(besthyperpara_tfidf_DT.predict_proba(tfidf_X_train)[:,1])
pred_proba_test_tfidf_DT=(besthyperpara_tfidf_DT.predict_proba(tfidf_X_test)[:,1])
roc_auc_test_tfidf_DT_= (roc_auc_score(y_test_40k,pred_proba_test_tfidf_DT))
roc_auc_train_tfidf_DT = (roc_auc_score(y_train_40k,pred_proba_train_tfidf_DT))
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
%matplotlib inline
fpr_test_tfidf_DT, tpr_test_tfidf_DT, thresholds = roc_curve(y_test_40k,pred_proba_test_tfidf_DT)
fpr_train_tfidf_DT, tpr_train_tfidf_DT, thresholds = roc_curve(y_train_40k,pred_proba_train_tfidf_DT)
# create plot
default_dpi = plt.rcParamsDefault['figure.dpi']
plt.rcParams['figure.dpi'] = default_dpi*1.1
plt.plot(fpr_test_tfidf_DT, tpr_test_tfidf_DT, label=' Test ROC curve on Review Text')
plt.scatter(fpr_test_tfidf_DT, tpr_test_tfidf_DT, label=' Test ROC curve on Review Text')
plt.plot(fpr_train_tfidf_DT, tpr_train_tfidf_DT, label=' Train ROC curve on Review Text')
plt.scatter(fpr_train_tfidf_DT, tpr_train_tfidf_DT, label=' Train ROC curveon Review Text')
plt.plot([0, 1], [0, 1], 'k--', label='Random guess')
plt.minorticks_on()
plt.grid(b=True, which='both', color='0.65', linestyle='-')
_ = plt.xlabel('False Positive Rate')
_ = plt.ylabel('True Positive Rate')
_ = plt.title('ROC Curve')
_ = plt.xlim([-0.02, 1])
_ = plt.ylim([0, 1.02])
_ = plt.legend(loc="lower right")
from sklearn.metrics import roc_auc_score
# Hard-label predictions from the grid-search best estimator.
predict_DT_tfidf_train = grid_DT_tfidf.predict(tfidf_X_train)
predict_DT_tfidf_test = grid_DT_tfidf.predict(tfidf_X_test)
# BUG FIX: the original assigned the TEST AUC to the *_train variable and
# vice versa; the summary table at the end then reported them swapped.
roc_auc_DT_tfidf_train = roc_auc_score(y_train_40k, predict_DT_tfidf_train)
roc_auc_DT_tfidf_test = roc_auc_score(y_test_40k, predict_DT_tfidf_test)
from sklearn.metrics import classification_report
print ("#######################################################")
print ("The classification report on Test dataset on Review Text")
print ("#######################################################")
print(classification_report(y_test_40k, predict_DT_tfidf_test))
print ("#######################################################")
print ("The classification report on Training dataset Review Text")
print ("#######################################################")
print(classification_report(y_train_40k, predict_DT_tfidf_train))
from sklearn.metrics import confusion_matrix
import scikitplot.metrics as skplt
# Shrink the figure for the 2x2 confusion-matrix plots.
default_dpi = plt.rcParamsDefault['figure.dpi']
plt.rcParams['figure.dpi'] = default_dpi*.63
skplt.plot_confusion_matrix(y_test_40k, predict_DT_tfidf_test,normalize=True)
print ("The first matrix is that of Test in normalized format")
print ("The second matrix is that of Train in normalized format")
print ("The third matrix is that of Test in non normalized format")
print ("The fourth matrix is that of Train in non normalized format")
skplt.plot_confusion_matrix(y_train_40k, predict_DT_tfidf_train,normalize=True)
skplt.plot_confusion_matrix(y_test_40k, predict_DT_tfidf_test)
skplt.plot_confusion_matrix(y_train_40k, predict_DT_tfidf_train)
# https://github.com/Manish-12/Decision-Tree-on-Amazon-fine-food-reviews/blob/master/Decision_tree.ipynb
# Feature importances sorted descending: top 20 n-grams, then the rest.
top_20_feature_tfidf = besthyperpara_tfidf_DT.feature_importances_.argsort()[::-1][:20]
below_20_feature_tfidf = besthyperpara_tfidf_DT.feature_importances_.argsort()[::-1][20:]
print(np.take(tf_idf_vect.get_feature_names(),top_20_feature_tfidf))
print(np.take(tf_idf_vect.get_feature_names(),below_20_feature_tfidf))
# Shallow (depth-5) tree so the Graphviz rendering stays readable.
besthyperpara_tfidf_DT_depth5 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=10, min_samples_split=1000,class_weight='balanced')
besthyperpara_tfidf_DT_depth5.fit(tfidf_X_train,y_train_40k)
#https://pythonprogramminglanguage.com/decision-tree-visual-example/
from IPython.display import Image
from sklearn.tree import export_graphviz
import graphviz
# Writes tree_tfidf.dot; the .png below must be generated externally.
dot_data = tree.export_graphviz(besthyperpara_tfidf_DT_depth5,feature_names =tf_idf_vect.get_feature_names() ,out_file='tree_tfidf.dot')
from IPython.display import Image
Image(filename = r'C:\Users\Prateek Saurabh\AppliedAI\Homework and assignments\tree_tfidf.png')
# Tokenise train/test review texts for Word2Vec: strip surrounding
# whitespace from each review, then split it into a list of words.
lst_train = [review.strip() for review in tqdm(X_no_stop_train_40k)]
lst_of_lst_train = [review.split() for review in tqdm(lst_train)]
lst_test = [review.strip() for review in tqdm(X_no_stop_test_40k)]
lst_of_lst_test = [review.split() for review in tqdm(lst_test)]
# Train a 50-dimensional Word2Vec model on the TRAINING tokens only, then
# represent every review as the unweighted mean of its in-vocabulary word
# vectors. Reviews containing no known words stay the zero vector.
w2v_model_self_taught_train=Word2Vec(lst_of_lst_train,min_count=1,size=50, workers=4)
w2v_words_train = list(w2v_model_self_taught_train.wv.vocab)

def _mean_word_vector(tokens):
    # Sum the vectors of tokens seen during training, then divide by how
    # many such tokens there were (guarding against division by zero).
    accumulator = np.zeros(50)
    known_count = 0
    for token in tokens:
        if token in w2v_words_train:
            accumulator += w2v_model_self_taught_train.wv[token]
            known_count += 1
    if known_count != 0:
        accumulator /= known_count
    return accumulator

sent_vectors_train = [_mean_word_vector(review) for review in tqdm(lst_of_lst_train)]
sent_vectors_test = [_mean_word_vector(review) for review in tqdm(lst_of_lst_test)]
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
# BUG FIX: sklearn.grid_search was removed in scikit-learn 0.20;
# GridSearchCV now lives in sklearn.model_selection.
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth':[1, 5, 10, 50, 100, 500, 1000],'min_samples_split':[5, 10, 100, 500,1000]}
# Same decision-tree search, now on the averaged Word2Vec features.
grid_DT_avgw2v = GridSearchCV(DecisionTreeClassifier(class_weight='balanced'),param_grid,scoring='roc_auc',cv=3, verbose=2)
grid_DT_avgw2v.fit(sent_vectors_train,y_train_40k)
grid_DT_avgw2v.best_params_
# Refit with the chosen hyper-parameters for probability outputs.
besthyperpara_avgw2v_DT = DecisionTreeClassifier(max_depth=100,min_samples_split=500,class_weight='balanced')
besthyperpara_avgw2v_DT.fit(sent_vectors_train,y_train_40k)
pred_proba_train_avgw2v_DT=(besthyperpara_avgw2v_DT.predict_proba(sent_vectors_train)[:,1])
pred_proba_test_avgw2v_DT=(besthyperpara_avgw2v_DT.predict_proba(sent_vectors_test)[:,1])
roc_auc_test_avgw2v_DT_= (roc_auc_score(y_test_40k,pred_proba_test_avgw2v_DT))
roc_auc_train_avgw2v_DT = (roc_auc_score(y_train_40k,pred_proba_train_avgw2v_DT))
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
%matplotlib inline
fpr_test_avgw2v_DT, tpr_test_avgw2v_DT, thresholds = roc_curve(y_test_40k,pred_proba_test_avgw2v_DT)
fpr_train_avgw2v_DT, tpr_train_avgw2v_DT, thresholds = roc_curve(y_train_40k,pred_proba_train_avgw2v_DT)
# create plot
default_dpi = plt.rcParamsDefault['figure.dpi']
plt.rcParams['figure.dpi'] = default_dpi*1.1
plt.plot(fpr_test_avgw2v_DT, tpr_test_avgw2v_DT, label=' Test ROC curve on Review Text')
plt.scatter(fpr_test_avgw2v_DT, tpr_test_avgw2v_DT, label=' Test ROC curve on Review Text')
plt.plot(fpr_train_avgw2v_DT, tpr_train_avgw2v_DT, label=' Train ROC curve on Review Text')
plt.scatter(fpr_train_avgw2v_DT, tpr_train_avgw2v_DT, label=' Train ROC curveon Review Text')
plt.plot([0, 1], [0, 1], 'k--', label='Random guess')
plt.minorticks_on()
plt.grid(b=True, which='both', color='0.65', linestyle='-')
_ = plt.xlabel('False Positive Rate')
_ = plt.ylabel('True Positive Rate')
_ = plt.title('ROC Curve')
_ = plt.xlim([-0.02, 1])
_ = plt.ylim([0, 1.02])
_ = plt.legend(loc="lower right")
from sklearn.metrics import roc_auc_score
# Hard-label predictions from the grid-search best estimator.
predict_DT_avgw2v_train = grid_DT_avgw2v.predict(sent_vectors_train)
predict_DT_avgw2v_test = grid_DT_avgw2v.predict(sent_vectors_test)
from sklearn.metrics import classification_report
print ("#######################################################")
print ("The classification report on Test dataset on Review Text")
print ("#######################################################")
print(classification_report(y_test_40k, predict_DT_avgw2v_test))
print ("#######################################################")
print ("The classification report on Training dataset Review Text")
print ("#######################################################")
print(classification_report(y_train_40k, predict_DT_avgw2v_train))
# BUG FIX: the original assigned the TEST AUC to the *_train variable and
# vice versa; the summary table at the end then reported them swapped.
roc_auc_DT_avgw2v_train = roc_auc_score(y_train_40k, predict_DT_avgw2v_train)
roc_auc_DT_avgw2v_test = roc_auc_score(y_test_40k, predict_DT_avgw2v_test)
from sklearn.metrics import confusion_matrix
import scikitplot.metrics as skplt
# Shrink the figure for the 2x2 confusion-matrix plots.
default_dpi = plt.rcParamsDefault['figure.dpi']
plt.rcParams['figure.dpi'] = default_dpi*.63
skplt.plot_confusion_matrix(y_test_40k, predict_DT_avgw2v_test,normalize=True)
print ("The first matrix is that of Test in normalized format")
print ("The second matrix is that of Train in normalized format")
print ("The third matrix is that of Test in non normalized format")
print ("The fourth matrix is that of Train in non normalized format")
skplt.plot_confusion_matrix(y_train_40k, predict_DT_avgw2v_train,normalize=True)
skplt.plot_confusion_matrix(y_test_40k, predict_DT_avgw2v_test)
skplt.plot_confusion_matrix(y_train_40k, predict_DT_avgw2v_train)
# TF-IDF weighted Word2Vec: each review becomes the idf*tf weighted mean of
# its word vectors.
# BUG FIX: the loop bodies below had lost their indentation in the notebook
# export (a SyntaxError as plain Python); restored here.
model_tfidfw2v = TfidfVectorizer()
model_tfidfw2v.fit(X_no_stop_train_40k)
# feature name -> idf weight learned on the training corpus
dictionary = dict(zip(model_tfidfw2v.get_feature_names(), list(model_tfidfw2v.idf_)))
tfidf_feat_tfidfw2v = model_tfidfw2v.get_feature_names()
# Sets for O(1) membership tests — the original tested membership in plain
# lists, which is O(n) per word (identical results, far slower).
_tfidf_feat_set = set(tfidf_feat_tfidfw2v)
_w2v_vocab_set = set(w2v_words_train)
tfidf_w2v_sent_vectors_train = []  # the tfidf-w2v vector of each train review
row = 0
for sent4 in tqdm(lst_of_lst_train):  # for each review/sentence
    sent_vec4 = np.zeros(50)  # matches the Word2Vec vector size
    weight_sum4 = 0  # sum of tf-idf weights of words with a valid vector
    for word4 in sent4:  # for each word in a review/sentence
        if word4 in _w2v_vocab_set and word4 in _tfidf_feat_set:
            vec4 = w2v_model_self_taught_train.wv[word4]
            # tf-idf of this word in this review: idf * (count / review length)
            tf_idf_train = dictionary[word4]*(sent4.count(word4)/len(sent4))
            sent_vec4 += (vec4 * tf_idf_train)
            weight_sum4 += tf_idf_train
    if weight_sum4 != 0:
        sent_vec4 /= weight_sum4
    tfidf_w2v_sent_vectors_train.append(sent_vec4)
    row += 1
tfidf_w2v_sent_vectors_test = []  # the tfidf-w2v vector of each test review
row = 0
for sent5 in tqdm(lst_of_lst_test):  # for each review/sentence
    sent_vec5 = np.zeros(50)
    weight_sum5 = 0
    for word5 in sent5:
        if word5 in _w2v_vocab_set and word5 in _tfidf_feat_set:
            vec5 = w2v_model_self_taught_train.wv[word5]
            tf_idf_test = dictionary[word5]*(sent5.count(word5)/len(sent5))
            sent_vec5 += (vec5 * tf_idf_test)
            weight_sum5 += tf_idf_test
    if weight_sum5 != 0:
        sent_vec5 /= weight_sum5
    tfidf_w2v_sent_vectors_test.append(sent_vec5)
    row += 1
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
# BUG FIX: sklearn.grid_search was removed in scikit-learn 0.20;
# GridSearchCV now lives in sklearn.model_selection.
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth':[1, 5, 10, 50, 100, 500, 1000],'min_samples_split':[5, 10, 100, 500,1000]}
# Same decision-tree search, now on the tf-idf weighted Word2Vec features.
grid_DT_tfidfw2v = GridSearchCV(DecisionTreeClassifier(class_weight='balanced'),param_grid,scoring='roc_auc',cv=3, verbose=2)
grid_DT_tfidfw2v.fit(tfidf_w2v_sent_vectors_train,y_train_40k)
grid_DT_tfidfw2v.best_params_
# Refit with the chosen hyper-parameters for probability outputs.
besthyperpara_tfidfw2v_DT = DecisionTreeClassifier(max_depth=500,min_samples_split=500,class_weight='balanced')
besthyperpara_tfidfw2v_DT.fit(tfidf_w2v_sent_vectors_train,y_train_40k)
pred_proba_train_tfidfw2v_DT=(besthyperpara_tfidfw2v_DT.predict_proba(tfidf_w2v_sent_vectors_train)[:,1])
pred_proba_test_tfidfw2v_DT=(besthyperpara_tfidfw2v_DT.predict_proba(tfidf_w2v_sent_vectors_test)[:,1])
roc_auc_test_tfidfw2v_DT_= (roc_auc_score(y_test_40k,pred_proba_test_tfidfw2v_DT))
roc_auc_train_tfidfw2v_DT = (roc_auc_score(y_train_40k,pred_proba_train_tfidfw2v_DT))
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
%matplotlib inline
fpr_test_tfidfw2v_DT, tpr_test_tfidfw2v_DT, thresholds = roc_curve(y_test_40k,pred_proba_test_tfidfw2v_DT)
fpr_train_tfidfw2v_DT, tpr_train_tfidfw2v_DT, thresholds = roc_curve(y_train_40k,pred_proba_train_tfidfw2v_DT)
# create plot
default_dpi = plt.rcParamsDefault['figure.dpi']
plt.rcParams['figure.dpi'] = default_dpi*1.1
plt.plot(fpr_test_tfidfw2v_DT, tpr_test_tfidfw2v_DT, label=' Test ROC curve on Review Text')
plt.scatter(fpr_test_tfidfw2v_DT, tpr_test_tfidfw2v_DT, label=' Test ROC curve on Review Text')
plt.plot(fpr_train_tfidfw2v_DT, tpr_train_tfidfw2v_DT, label=' Train ROC curve on Review Text')
plt.scatter(fpr_train_tfidfw2v_DT, tpr_train_tfidfw2v_DT, label=' Train ROC curveon Review Text')
plt.plot([0, 1], [0, 1], 'k--', label='Random guess')
plt.minorticks_on()
plt.grid(b=True, which='both', color='0.65', linestyle='-')
_ = plt.xlabel('False Positive Rate')
_ = plt.ylabel('True Positive Rate')
_ = plt.title('ROC Curve')
_ = plt.xlim([-0.02, 1])
_ = plt.ylim([0, 1.02])
_ = plt.legend(loc="lower right")
from sklearn.metrics import roc_auc_score
# BUG FIX: the original predicted with grid_DT_avgw2v (the AVG-W2V model)
# on the TFIDF-W2V features; use the TFIDF-W2V grid search instead.
predict_DT_tfidfw2v_train = grid_DT_tfidfw2v.predict(tfidf_w2v_sent_vectors_train)
predict_DT_tfidfw2v_test = grid_DT_tfidfw2v.predict(tfidf_w2v_sent_vectors_test)
from sklearn.metrics import classification_report
print ("#######################################################")
print ("The classification report on Test dataset on Review Text")
print ("#######################################################")
print(classification_report(y_test_40k, predict_DT_tfidfw2v_test))
print ("#######################################################")
print ("The classification report on Training dataset Review Text")
print ("#######################################################")
print(classification_report(y_train_40k, predict_DT_tfidfw2v_train))
# BUG FIX: the original assigned the TEST AUC to the *_train variable and
# vice versa; the summary table at the end then reported them swapped.
roc_auc_DT_tfidfw2v_train = roc_auc_score(y_train_40k, predict_DT_tfidfw2v_train)
roc_auc_DT_tfidfw2v_test = roc_auc_score(y_test_40k, predict_DT_tfidfw2v_test)
from sklearn.metrics import confusion_matrix
import scikitplot.metrics as skplt
# Shrink the figure for the 2x2 confusion-matrix plots.
default_dpi = plt.rcParamsDefault['figure.dpi']
plt.rcParams['figure.dpi'] = default_dpi*.63
skplt.plot_confusion_matrix(y_test_40k, predict_DT_tfidfw2v_test,normalize=True)
print ("The first matrix is that of Test in normalized format")
print ("The second matrix is that of Train in normalized format")
print ("The third matrix is that of Test in non normalized format")
print ("The fourth matrix is that of Train in non normalized format")
skplt.plot_confusion_matrix(y_train_40k, predict_DT_tfidfw2v_train,normalize=True)
skplt.plot_confusion_matrix(y_test_40k, predict_DT_tfidfw2v_test)
skplt.plot_confusion_matrix(y_train_40k, predict_DT_tfidfw2v_train)
from prettytable import PrettyTable
# Final summary: the grid-search-selected hyper-parameters and the
# resulting train/test AUC for every featurisation.
_summary_rows = [
    ("Decision Tree BOW Test", "'max_depth': 50, 'min_samples_split': 1000", roc_auc_DT_BOW_test),
    ("Decision Tree BOW Train", "'max_depth': 50, 'min_samples_split': 1000", roc_auc_DT_BOW_train),
    ("Decision Tree TFIDF Test", "'max_depth': 50, 'min_samples_split': 1000", roc_auc_DT_tfidf_test),
    ("Decision Tree TFIDF Train", "'max_depth': 50, 'min_samples_split': 1000", roc_auc_DT_tfidf_train),
    ("Decision Tree AVGW2V Test", "'max_depth': 100, 'min_samples_split': 500", roc_auc_DT_avgw2v_test),
    ("Decision Tree AVGW2V Train", "'max_depth': 100, 'min_samples_split': 500", roc_auc_DT_avgw2v_train),
    ("Decision Tree TFIDF-W2V Test", "'max_depth': 500, 'min_samples_split': 500", roc_auc_DT_tfidfw2v_test),
    ("Decision Tree TFIDF-W2V Train", "'max_depth': 500, 'min_samples_split': 500", roc_auc_DT_tfidfw2v_train),
]
x = PrettyTable()
x.field_names = ["Algorithm","Hyper-Parameter", "AUC"]
for _algo, _hyper, _auc in _summary_rows:
    x.add_row([_algo, _hyper, _auc])
print (x)